import org.apache.flink.api.common.functions.MapPartitionFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
import java.util.Iterator;
import static java.io.FileDescriptor.out;

/**
 * Minimal Flink DataSet example for {@code partitionByHash} followed by
 * {@code mapPartition}.
 *
 * <p>Reads {@code group.csv} from the current working directory as
 * (Integer, String) rows, hash-partitions the rows on the Integer field
 * (position 0), collapses every non-empty partition into a single
 * (key, concatenated values) tuple, and prints the result.
 *
 * <p>Author's background note on hash partitioning (the first sentence is
 * translated from the original comment; the rest is quoted):
 * the effect of partitionByHash is to make the data volume of each partition
 * fairly even.
 * NOTE(review): how even the partitions are depends on the key distribution —
 * records with the same key always land in the same partition.
 *
 * <p>https://www.techopedia.com/definition/31996/hash-partitioning
 * <br>Techopedia explains Hash Partitioning:
 * Hash partitioning is a method to separate out information in a randomized way
 * rather than putting the data in the form of groups.
 * This partitioning system can be used efficiently to manage data on a particular platform.
 * However, there are no performance benefits associated with hash partitioning,
 * as it shuffles the data across the table space randomly.
 *
 * <p>The partitioning system can be used to efficiently match queries.
 * It makes use of hashing algorithms to distribute the data across the device to
 * space out the load. By this method, the partitions are approximately the same size.
 * The data that can be partitioned is not historical in nature, and thus this
 * method is very easy to use.
 */
public class HashPartition
{

    /**
     * Runs the example job.
     *
     * @param args unused
     * @throws Exception if reading the CSV file or executing the job fails
     */
    public static void main(String[] args) throws Exception
    {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // The input file is expected next to the directory the JVM was started from.
        String rootPath = System.getProperty("user.dir");
        DataSet<Tuple2<Integer, String>> in = env.readCsvFile("file://" + rootPath + "/" + "group.csv").types(Integer.class, String.class);

        // Hash-partition the DataSet on the Integer field (position 0) and apply a
        // MapPartition transformation to each resulting partition.
        DataSet<Tuple2<String, String>> out = in.partitionByHash(0).mapPartition(new MapPartitionFunction<Tuple2<Integer, String>, Tuple2<String, String>>()
        {
            /**
             * Collapses one partition into a single tuple.
             *
             * <p>This logic only exists to make the example from the official
             * documentation complete and runnable; it has little practical meaning.
             * The emitted key is the key of the LAST record seen in the partition
             * (one partition may hold several distinct keys); the emitted value is
             * the concatenation of all String fields in iteration order.
             *
             * @param iterable  all records assigned to this partition
             * @param collector receives at most one result tuple
             */
            @Override
            public void mapPartition(Iterable<Tuple2<Integer, String>> iterable, Collector<Tuple2<String, String>> collector) throws Exception
            {
                String key = null;
                StringBuilder sum = new StringBuilder();
                boolean sawRecord = false;

                for (Tuple2<Integer, String> curr : iterable)
                {
                    key = curr.f0.toString();
                    // append(String) renders null as "null", same as the former `sum + curr.f1`.
                    sum.append(curr.f1);
                    sawRecord = true;
                }

                // A partition can be empty (for example when parallelism exceeds the
                // number of distinct key hashes). Emitting (null, "") in that case would
                // put a null field into a Flink tuple, which tuples do not support, so
                // nothing is emitted for empty partitions.
                if (sawRecord)
                {
                    // emit tuple with key and sum
                    collector.collect(new Tuple2<>(key, sum.toString()));
                }
            }
        });

        System.out.println("------------------------输出out--------------------------------");
        out.print();
    }

}




